{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.linear_model import LinearRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def lm(x, y, data, intercept=True):\n",
    "    \"\"\"Returns the coefficients from regressing y on x.\n",
    "    \n",
    "    Inputs:\n",
    "        - x: a list containing the names of the x variables\n",
    "        - y: the name of the y variable\n",
    "        - data: a Pandas data frame (the names in x and y must be columns in this data frame)\n",
    "        - intercept: boolean indicating whether or not to include an intercept term\n",
    "        \n",
    "    Outputs: A Pandas series with the estimated coefficients, indexed by the x variable names.\n",
    "    \"\"\"\n",
    "    \n",
    "    if intercept:\n",
    "        beta = [0] * (len(x) + 1)\n",
    "        names = [\"Intercept\"] + x\n",
    "    else:\n",
    "        beta = [0] * len(x)\n",
    "        names = x\n",
    "        \n",
    "    return pd.Series(data=beta, index=names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some Data To Test Your Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the 25 predictor columns, applied to the file's columns by position.\n",
    "predictors = [\n",
    "    \"symboling\", \"normalized-losses\", \"make\", \"fuel-type\", \"aspiration\",\n",
    "    \"num-of-doors\", \"body-style\", \"drive-wheels\", \"engine-location\", \"wheel-base\",\n",
    "    \"length\", \"width\", \"height\", \"curb-weight\", \"engine-type\",\n",
    "    \"num-of-cylinders\", \"engine-size\", \"fuel-system\", \"bore\", \"stroke\",\n",
    "    \"compression-ratio\", \"horsepower\", \"peak-rpm\", \"city-mpg\", \"highway-mpg\",\n",
    "]\n",
    "\n",
    "# The file is read without a header row, so column names are supplied here;\n",
    "# \"price\" is the final column.\n",
    "data = pd.read_csv(\n",
    "    \"http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n",
    "    header=None,\n",
    "    names=[*predictors, \"price\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following code drops every row that contains a missing value (represented by \"?\" in this data set) and converts columns to numeric types where possible, so that linear regression can be fit to the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(data.shape)\n",
    "\n",
    "# Only text (object) columns are checked for the \"?\" missing-value marker.\n",
    "object_cols = [col for col in data.columns if data[col].dtype == object]\n",
    "\n",
    "# Drop every row that has a \"?\" in any of those columns. The explicit copy\n",
    "# makes the assignments below write to a real frame rather than a slice\n",
    "# (avoids SettingWithCopyWarning).\n",
    "data = data[(data[object_cols] != \"?\").all(axis=1)].copy()\n",
    "\n",
    "for col in object_cols:\n",
    "    try:\n",
    "        data[col] = pd.to_numeric(data[col])\n",
    "    except (ValueError, TypeError):\n",
    "        # Values that do not parse as numbers: keep the column as text.\n",
    "        pass\n",
    "\n",
    "print(data.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test 1: Quantitative Predictors Only\n",
    "\n",
    "Let's test out the `lm` function you just wrote on some quantitative predictors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lm(x=[\"length\", \"width\", \"height\"], y=\"price\", data=data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check that your `lm` function produces the same results as scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Reference fit with scikit-learn on the same three predictors; fit() returns the estimator.\n",
    "model = LinearRegression().fit(data[[\"length\", \"width\", \"height\"]], data[\"price\"])\n",
    "model.intercept_, model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test 2: Categorical Predictors\n",
    "\n",
    "Your `lm` function should also handle categorical variables automatically (i.e., it should expand each categorical variable with $k$ levels into $k-1$ 0-1 indicator variables)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lm(x=predictors, y=\"price\", data=data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check that your `lm` function produces the same results as scikit-learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Dummy-encode the predictors with drop_first=True, then fit the reference model on them.\n",
    "data_expanded = pd.get_dummies(data[predictors], drop_first=True)\n",
    "model = LinearRegression().fit(data_expanded, data[\"price\"])\n",
    "model.intercept_, model.coef_"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}